The dataset was retrieved from Kaggle (https://www.kaggle.com/adhok93/presidentialaddress/home) and contains all 58 inauguration speeches of presidents from the George Washington’s first address to present day.
inaug_speeches <- read.csv("C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks/inaug_speeches.csv", encoding = "ASCII")##install needed packages
#install.packages(c("psych", "data.table", "stringr", "lubridate","tidyr", "dplyr", "tidytext",
##"ggplot2", "RWeka", "quanteda", "tm","wordcloud", "lsa", "ggcorrplot", "plotly", "rJava", "NLP",
##"openNLP","openNLPmodels.en", "reshape", "tibble","googleVis", "magrittr", "RColorBrewer"))##initial exploration
str(inaug_speeches)## 'data.frame': 58 obs. of 6 variables:
## $ X : int 4 5 6 7 8 9 10 11 12 13 ...
## $ Name : Factor w/ 39 levels "Abraham Lincoln",..: 13 13 23 32 32 20 20 21 21 25 ...
## $ Inaugural.Address: Factor w/ 5 levels "First Inaugural Address",..: 1 4 3 1 4 1 4 1 4 3 ...
## $ Date : Factor w/ 58 levels "20-Jan-97","Friday, January 20, 1961",..: 35 14 29 54 15 30 39 47 22 5 ...
## $ text : Factor w/ 58 levels " ABOUT to add the solemnity of an oath to the obligations imposed by a second call to the station "| __truncated__,..: 58 42 26 43 18 23 1 6 31 7 ...
## $ text2 : Factor w/ 2 levels ""," in some cases as the powers which they respectively claim are often not defined by any distinct lines. Mischi"| __truncated__: 1 1 1 1 1 1 1 1 1 1 ...
inaug_speeches_dt <- as.data.table(inaug_speeches)
names(inaug_speeches)## [1] "X" "Name" "Inaugural.Address"
## [4] "Date" "text" "text2"
##count unique values in each column
apply(inaug_speeches[, c(1:ncol(inaug_speeches))], 2, function(x) length(unique(x)))## X Name Inaugural.Address Date
## 58 39 5 58
## text text2
## 58 2
##counts of each unique president
dt <- inaug_speeches_dt[, .(number_of_distinct_records = uniqueN(text)), by = Name]
##convert to kable tables
kable(dt, escape = F, "html", table.attr = "class='dtable'") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")| Name | number_of_distinct_records |
|---|---|
| George Washington | 2 |
| John Adams | 1 |
| Thomas Jefferson | 2 |
| James Madison | 2 |
| James Monroe | 2 |
| John Quincy Adams | 1 |
| Andrew Jackson | 2 |
| Martin Van Buren | 1 |
| William Henry Harrison | 1 |
| James Knox Polk | 1 |
| Zachary Taylor | 1 |
| Franklin Pierce | 1 |
| James Buchanan | 1 |
| Abraham Lincoln | 2 |
| Ulysses S. Grant | 2 |
| Rutherford B. Hayes | 1 |
| James A. Garfield | 1 |
| Grover Cleveland | 2 |
| Benjamin Harrison | 1 |
| William McKinley | 2 |
| Theodore Roosevelt | 1 |
| William Howard Taft | 1 |
| Woodrow Wilson | 2 |
| Warren G. Harding | 1 |
| Calvin Coolidge | 1 |
| Herbert Hoover | 1 |
| Franklin D. Roosevelt | 4 |
| Harry S. Truman | 1 |
| Dwight D. Eisenhower | 2 |
| John F. Kennedy | 1 |
| Lyndon Baines Johnson | 1 |
| Richard Milhous Nixon | 2 |
| Jimmy Carter | 1 |
| Ronald Reagan | 2 |
| George Bush | 1 |
| Bill Clinton | 2 |
| George W. Bush | 2 |
| Barack Obama | 2 |
| Donald J. Trump | 1 |
##convert strings we will analyze to charachters
inaug_speeches_dt <- inaug_speeches_dt[, text:=as.character(text)]
inaug_speeches_dt <- inaug_speeches_dt[, text2:=as.character(text2)]
##combine text columns in r (only one that couldn't fit into one excel cell)
inaug_speeches_dt$text <- apply(inaug_speeches_dt[ , c("text", "text2") ] ,1 , paste , collapse = "" )
##create duplicate text column (for manipulation)
##trim whitespace and covert text to lowercase
inaug_speeches_dt$text_final <- trimws(inaug_speeches_dt$text)
##convert to regex and add additional text cleanin using stringr
##remove encoding errors using stringr (i.e. <U+AO97>) see <https://stackoverflow.com/questions/39993715/how-to-remove-unicode-u00a6-from-string>
inaug_speeches_dt$text_final <- gsub("\\s*<u\\+\\w+>\\s*", " ", inaug_speeches_dt$text_final)
inaug_speeches_dt$text_final <- gsub("\\s*<U\\+\\w+>\\s*", " ", inaug_speeches_dt$text_final)
##convert whitespace to single space
inaug_speeches_dt$text_final <- gsub("\\s", " ", inaug_speeches_dt$text_final)
##remove non-reg charachters
inaug_speeches_dt$text_final <- gsub("[^[A-Za-z0-9 ][:punct:]]", "", inaug_speeches_dt$text_final)##load require packages
require(stringr)
require(lubridate)
require(tidyr)
require(dplyr)##check unique
#unique(inaug_speeches_dt$Date)
##make copy of original date format for manipulation
inaug_speeches_dt$DateOriginal <- inaug_speeches_dt$Date
##edit missmatched date format (Clinton 1997)
inaug_speeches_dt$Date <- gsub("20-Jan-97", "Monday, January 20, 1997", inaug_speeches_dt$Date)
## split current date formats
inaug_speeches_dt <- separate(inaug_speeches_dt, "Date", c('DayOfWeek', 'MonthDay', 'Year'), sep = ",")
inaug_speeches_dt$MonthDay <- trimws(inaug_speeches_dt$MonthDay)
inaug_speeches_dt$Year <- trimws(inaug_speeches_dt$Year)
##replace NA Values in main date column with
#inaug_speeches_dt$MonthDay <- ifelse(is.na(inaug_speeches_dt$MonthDay), inaug_speeches_dt$DayOfWeek, inaug_speeches_dt$MonthDay)
##recode factors for First Inaugural address
inaug_speeches_dt$Inaugural.Address <- recode_factor(inaug_speeches_dt$Inaugural.Address, "Inaugural Address" = "First Inaugural Address")
##replace non-day of the week with "unknown"
inaug_speeches_dt$DayOfWeek <-
ifelse(inaug_speeches_dt$DayOfWeek == "Monday", "Monday",
ifelse(inaug_speeches_dt$DayOfWeek == "Tuesday", "Tuesday",
ifelse(inaug_speeches_dt$DayOfWeek == "Wednesday", "Wednesday",
ifelse(inaug_speeches_dt$DayOfWeek == "Thursday", "Thursday",
ifelse(inaug_speeches_dt$DayOfWeek == "Friday", "Friday",
ifelse(inaug_speeches_dt$DayOfWeek == "Saturday", "Saturday",
ifelse(inaug_speeches_dt$DayOfWeek == "Sunday", "Sunday", "Unknown")))))))
##check unique days of week
dt <- inaug_speeches_dt[, .(number_of_distinct = uniqueN(text_final)), by = DayOfWeek]
kable(dt, escape = F, "html", table.attr = "class='dtable'") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")| DayOfWeek | number_of_distinct |
|---|---|
| Thursday | 9 |
| Monday | 18 |
| Saturday | 9 |
| Wednesday | 8 |
| Tuesday | 7 |
| Friday | 7 |
##concatenate DayMonth and Year
inaug_speeches_dt$Date <- apply(inaug_speeches_dt[, c('MonthDay', 'Year')], 1, paste, collapse = ",")
inaug_speeches_dt$Date <- gsub(",NA", "", inaug_speeches_dt$Date)
##check unique month/day
#unique(inaug_speeches_dt$MonthDay)
##multidate function in r
multidate <- function(data, formats){
data <- gsub("nd", "", data, perl = TRUE)
a<-list()
for(i in 1:length(formats)){
a[[i]]<- as.Date(data,format=formats[i])
a[[1]][!is.na(a[[i]])]<-a[[i]][!is.na(a[[i]])]
}
a[[1]]
}
###converts disparate date types (list) to common format
inaug_speeches_dt$Date <- multidate(inaug_speeches_dt$Date,
c("%m/%d/%Y","%d.%m.%Y","%d %b %y", "%d %b %y", "%B %d, %Y",
"%B %d %Y", "%d-%b-%y", "%m.%d.%Y", "%d %b, %Y", "%A, %m/%d/%y",
"%A, %B %d, %Y", "%d of %B %Y", "%m%d%y", "%d%m%Y", "%d-%m-%Y",
"%A %B %d %Y", "%d %B, %Y", "%d %B %Y", "%d %B %Y", "%B %d %Y",
"%B %d, %Y", "%A %B %d, %Y", "%h %d %Y", "%m/%d%y", "%d/%m/%Y",
"%d/%m/%Y", "%m%d%y", "%d %B %Y", "%B %d,%Y", "%b. %d,%Y",
"%b-%d-%Y", "%m/%d.%Y"))
###change format to m/d/Y
inaug_speeches_dt$Date <- format(inaug_speeches_dt$Date, "%m/%d/%Y")
#unique(inaug_speeches_dt$Date)##calculate number of charachters
inaug_speeches_dt$nchar <- nchar(inaug_speeches_dt$text_final)
##calculate number of words
nwords <- nwords <- function(string, pseudo=F){
ifelse( pseudo,
pattern <- "\\S+",
pattern <- "[[:alpha:]]+"
)
str_count(string, pattern)
}
##calculate number of words using nwords() function
inaug_speeches_dt$num_words <- nwords(inaug_speeches_dt$text_final)
##create new column for column labels for President
inaug_speeches_dt$PresidentNumber <- seq.int(nrow(inaug_speeches_dt))
inaug_speeches_dt$Speech <- apply(inaug_speeches_dt[, c('PresidentNumber', 'Name')], 1, paste, collapse = " ")## Visualizing speech length
require(tidytext)
require(dplyr)
require(ggplot2)##create barchart in ggplot
inaug_speeches_dt %>%
unnest_tokens(word,text_final) %>%
group_by(Speech) %>%
summarise(num_words=n()) %>%
mutate(mean_words=mean(num_words)) %>%
ggplot(aes(x=Speech,y=(num_words)))+geom_bar(stat = "identity",width=0.5,
aes(fill = inaug_speeches_dt$Inaugural.Address)) +
scale_fill_manual(values = c("red", "goldenrod", "blue", "light blue", "black")) +
theme(axis.text.x = element_text(vjust=1,angle=90)) + theme(legend.position="bottom") +
geom_text(aes(label=inaug_speeches_dt$Year), vjust=0,angle=90,size=2.5,hjust=0)+ylim(c(0,11500)) +
labs(title="Speech Length",
caption="United States: Inauguration Speeches")##checkaverage speech length by inaigural address number
inaug_groups <-inaug_speeches_dt %>%
group_by(Inaugural.Address) %>%
summarise_at(vars(num_words), funs(mean(., na.rm=TRUE))) %>%
arrange(desc(num_words)) %>%
mutate(num_words = round(num_words, 2))
inaug_groups <- as.data.table(inaug_groups)
##convert to kable tables
kable(inaug_groups, escape = F, "html", table.attr = "class='dtable'") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")| Inaugural.Address | num_words |
|---|---|
| First Inaugural Address | 2637.18 |
| Second Inaugural Address | 1836.82 |
| Third Inaugural Address | 1346.00 |
| Fourth Inaugural Address | 559.00 |
## subset columns
inaug_speeches_clean <- subset(inaug_speeches_dt, select = c("PresidentNumber", "Name","Inaugural.Address",
"DayOfWeek", "Year", "Date", "DateOriginal",
"text_final","nchar", "num_words", "Speech"))
##################################### Export Cleaned Data ##########################################
#setwd('C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks')
#write.csv(inaug_speeches_clean, "inaug_speeches_clean.csv")require(RWeka)
require(quanteda)
require(tm)
require(data.table)Adjust N-Gram selction (one or more) in this section
##check structure of the data
#str(inaug_speeches_clean)
###create data table for conversion of free text to charachters
inaug_speeches_clean <- as.data.table(inaug_speeches_clean)
###specify the text column to be used
text <- inaug_speeches_clean$text_final
###custom stopwords list
custom_stopwords <- (c(stopwords("english"), "otherwords"))
################################# tokenize using quanteda ##########################################
##tokenization and text cleaning
toks <- tokens(text, remove_punct = TRUE,
remove_symbols = TRUE)
toks <- tokens_tolower(toks)
#toks <- tokens_wordstem(toks, language = quanteda_options("language_stemmer"))
toks <- tokens_remove(toks, custom_stopwords)
###create n-grams, specify size (sparsity/computing power are a consideration here)
toksNgrams <- tokens_ngrams(toks, n = c(2, 3), concatenator = "_")
###convert to data frame matrix
dfm_toksNgrams <- as.matrix(toksNgrams)
dfm_toksNgrams <- dfm(toksNgrams)
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, c(stopwords("english"), "high_school"))
###remove leading and trailing stopwords from n-grams
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams,
pattern = c(paste0("^", stopwords("english"), "_"),
paste0("_", stopwords("english"), "$")),
valuetype = "regex")
##rename rows of matrix to presidents names
row.names(dfm_toksNgrams) <- inaug_speeches_clean$Speech
##frequency dist
term_frequency <- colSums(dfm_toksNgrams)
# Sort term_frequency in descending order
term_frequency <- sort(term_frequency, decreasing = TRUE)
term_frequency <- as.data.frame(term_frequency)
# extract column names (words/ngrams) to new column in r
setDT(term_frequency, keep.rownames = TRUE)[]## rn term_frequency
## 1: united_states 158
## 2: let_us 101
## 3: american_people 41
## 4: federal_government 34
## 5: men_women 28
## ---
## 121852: dream_waiting_hopeful 1
## 121853: waiting_hopeful_world 1
## 121854: hopeful_world_god 1
## 121855: world_god_bless 1
## 121856: bless_may_god 1
# alternative method of extract column names (words/ngrams) to new column in r
# term_frequency$names <- rownames(term_frequency)## write term frequency into new file (optional)
#setwd('C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks')
#write.csv(term_frequency, "inaug_term_frequency.csv")require(ggplot2)
library(wordcloud)##subset top 40 rows
term_frequency2 <- term_frequency %>%
arrange(desc(term_frequency)) %>%
head(40)
## Plot a barchart of the most frequent words/phrases
ggplot(term_frequency2, aes(x=reorder(rn, -term_frequency),y=(term_frequency))) +
geom_bar(stat = "identity",width=0.5,
aes(fill = term_frequency2$term_frequency)) +
theme(axis.text.x = element_text(vjust=1,angle=90)) + theme(legend.position="none") +
geom_text(aes(label=term_frequency), vjust=0,angle=90,size=2.5,hjust=0)+
labs(title="Most Common Phrases", caption="United States: Inauguration Speeches")
Adjust N-Gram selction (one or more) in this section
##check structure of the data
#str(inaug_speeches_clean)
###create data table for conversion of free text to charachters
inaug_speeches_clean <- as.data.table(inaug_speeches_clean)
###specify the text column to be used
text <- inaug_speeches_clean$text_final
###custom stopwords list
custom_stopwords <- (c(stopwords("english"), "otherwords"))
################################# tokenize using quanteda ##########################################
##tokenization and text cleaning
toks <- tokens(text, remove_punct = TRUE,
remove_symbols = TRUE)
toks <- tokens_tolower(toks)
#toks <- tokens_wordstem(toks, language = quanteda_options("language_stemmer"))
toks <- tokens_remove(toks, custom_stopwords)
###create n-grams, specify size (sparsity/computing power are a consideration here)
toksNgrams <- tokens_ngrams(toks, n = c(1), concatenator = "_")
###convert to data frame matrix
dfm_toksNgrams <- as.matrix(toksNgrams)
dfm_toksNgrams <- dfm(toksNgrams)
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, c(stopwords("english"), "high_school"))
###remove leading and trailing stopwords from n-grams
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams,
pattern = c(paste0("^", stopwords("english"), "_"),
paste0("_", stopwords("english"), "$")),
valuetype = "regex")
##rename rows of matrix to presidents names
row.names(dfm_toksNgrams) <- inaug_speeches_clean$Speech
##frequency dist
term_frequency <- colSums(dfm_toksNgrams)
# Sort term_frequency in descending order
term_frequency <- sort(term_frequency, decreasing = TRUE)
term_frequency <- as.data.frame(term_frequency)
# extract column names (words/ngrams) to new column in r
setDT(term_frequency, keep.rownames = TRUE)[]
# alternative method of extract column names (words/ngrams) to new column in r
# term_frequency$names <- rownames(term_frequency)##can compare 2 or more texts (president numbers)
dat <- dfm_toksNgrams[c(57, 58),]
##convert from quanteda dfm to tm DTM
dat <- convert(dat, to = "tm")
##convert from DTM to TDM
dat <- as.TermDocumentMatrix(dat)
dat <- as.matrix(dat)
##create clouds from wordcloud package
comparison.cloud(dat,max.words=80,random.order=FALSE,colors=c("#1F497D","#C0504D", "light blue"),
main="Differences Between Inauguration Speeches")commonality.cloud(dat,random.order=FALSE,max.words=50, color="#1F497D",main="Commonalities in Inauguration Speeches")
A measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. We can use this measure to gauge relative similarity of two texts.
library(lsa)
require(ggplot2)
require(RWeka)
require(quanteda)
require(tm)
require(data.table)#################################### Create tokens for cosine similarity ###########################
##check structure of the data
#str(inaug_speeches_clean)
###create data table for conversion of free text to charachters
inaug_speeches_clean <- as.data.table(inaug_speeches_clean)
###specify the text column to be used
text <- inaug_speeches_clean$text_final
###custom stopwords list
custom_stopwords <- (c(stopwords("english"), "otherwords"))
################################# tokenize using quanteda ##########################################
##tokenization and text cleaning
toks <- tokens(text, remove_punct = TRUE,
remove_symbols = TRUE)
toks <- tokens_tolower(toks)
#toks <- tokens_wordstem(toks, language = quanteda_options("language_stemmer"))
toks <- tokens_remove(toks, custom_stopwords)
###create n-grams, specify size (sparsity/computing power are a consideration here)
toksNgrams <- tokens_ngrams(toks, n = c(1), concatenator = "_")
###convert to data frame matrix
dfm_toksNgrams <- as.matrix(toksNgrams)
dfm_toksNgrams <- dfm(toksNgrams)
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, c(stopwords("english"), "high_school"))
###remove leading and trailing stopwords from n-grams
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams,
pattern = c(paste0("^", stopwords("english"), "_"),
paste0("_", stopwords("english"), "$")),
valuetype = "regex")
#################################### Transform Matrix ##############################################
## (see https://www.youtube.com/watch?v=7cwBhWYHgsA)
##can compare 2 or more texts
CosineSim_dfm <- dfm_toksNgrams[c(1:nrow(dfm_toksNgrams)),]
##convert from quanteda dfm to tm DTM
CosineSim_dfm <- convert(CosineSim_dfm, to = "tm")
##check dimensions of dfm
dim(CosineSim_dfm)## [1] 58 9195
## get Cosine similarity in transposed Matrix format
CosineSim_Matrix <- cosine(t(as.matrix(CosineSim_dfm)))
##check dimensions and transpose
dim(CosineSim_Matrix)## [1] 58 58
##convert to data frame
CosineSim_Matrix <- as.data.frame(CosineSim_Matrix)
###################### join to primary dataset (optional) #########################################
##add cosine similarities to original text
## create unique ID to join Presdident Names to Cosine similarity text
CosineSim_Matrix$UniqueResp <- seq.int(nrow(CosineSim_Matrix))
inaug_speeches_clean$UniqueResp <- seq.int(nrow(inaug_speeches_clean))
##cartesian join
#inaug_speeches_wCosine <- merge(x = inaug_speeches_clean, y = CosineSim_Matrix, by = "UniqueResp", all.x = TRUE)require(ggcorrplot)#################################### visualize cosine similarity matrix ############################
##Convert RowNames
##convert row and column names
CosineSim_Matrix <- as.data.frame(CosineSim_Matrix)
row.names(CosineSim_Matrix) <- inaug_speeches_clean$Speech
colnames(CosineSim_Matrix) <- t(inaug_speeches_clean$Speech)
##subset matrix of presidents
CosineSim_Matrix_1900 <- CosineSim_Matrix[c(30:32, 34:37, 41:42, 44:46, 48:49, 51, 52, 54, 56, 58),
c(30:32, 34:37, 41:42, 44:46, 48:49, 51, 52, 54, 56, 58)]
##correlation matrix plot
ggcorrplot(CosineSim_Matrix, type = "lower", lab = TRUE, legend.title = "Cos Sim",
title = "Cosine Similarity of Inauguration Speeches",
show.diag = TRUE, outline.color = "black", lab_size = 2)##correlation matrix plot for presidents 1st Inauguration speeches since 1900
ggcorrplot(CosineSim_Matrix_1900, type = "lower", lab = TRUE, legend.title = "Cos Sim",
title = "Cosine Similarity of Inauguration Speeches",
show.diag = TRUE, outline.color = "black", lab_size = 2)As the visualizations show, there seems to be similar language across the different speeches. Based on our phrase frequencies, we may start to form the idea that these speeches use similar language.
Lexical Diversity refers to “the range of different words used in a text, with a greater range indicating a higher diversity” see https://rdrr.io/github/kbenoit/quanteda/man/textstat_lexdiv.html
require(RWeka)
require(quanteda)
require(tm)##check structure of the data
#str(inaug_speeches_clean)
###create data table for conversion of free text to charachters
inaug_speeches_clean <- as.data.table(inaug_speeches_clean)
###specify the text column to be used
text <- inaug_speeches_clean$text_final
###custom stopwords list
custom_stopwords <- (c(stopwords("english"), "otherwords"))
###tokenize using quanteda
##tokenization and text cleaning
toks <- tokens(text, remove_punct = TRUE,
remove_symbols = TRUE)
toks <- tokens_tolower(toks)
#toks <- tokens_wordstem(toks, language = quanteda_options("language_stemmer"))
toks <- tokens_remove(toks, custom_stopwords)
###create n-grams, specify size (sparsity/computing power are a consideration here)
toksNgrams <- tokens_ngrams(toks, n = c(1), concatenator = "_")
###convert to data frame matrix
dfm_toksNgrams <- as.matrix(toksNgrams)
dfm_toksNgrams <- dfm(toksNgrams)
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, c(stopwords("english"), "high_school"))
###remove leading and trailing stopwords from n-grams
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams,
pattern = c(paste0("^", stopwords("english"), "_"),
paste0("_", stopwords("english"), "$")),
valuetype = "regex")
##calculate lexical diversity
dt <- round(textstat_lexdiv(dfm_toksNgrams, measure = c("all", "TTR", "C", "R", "CTTR", "U", "S",
"Maas"), log.base = 10), 3)
row.names(dt) <- inaug_speeches_clean$Speech
kable(dt, escape = F, "html", table.attr = "class='dtable'") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")| TTR | C | R | CTTR | U | S | Maas | lgV0 | lgeV0 | |
|---|---|---|---|---|---|---|---|---|---|
| 1 George Washington | 0.771 | 0.960 | 19.813 | 14.010 | 70.461 | 0.961 | 0.119 | 9.665 | 22.254 |
| 2 George Washington | 0.935 | 0.984 | 7.366 | 5.209 | 110.921 | 0.972 | 0.095 | 9.849 | 22.678 |
| 3 John Adams | 0.654 | 0.939 | 21.410 | 15.139 | 49.774 | 0.943 | 0.142 | 8.282 | 19.071 |
| 4 Thomas Jefferson | 0.722 | 0.951 | 20.674 | 14.618 | 60.006 | 0.953 | 0.129 | 9.006 | 20.737 |
| 5 Thomas Jefferson | 0.670 | 0.942 | 21.365 | 15.107 | 52.038 | 0.946 | 0.139 | 8.457 | 19.473 |
| 6 James Madison | 0.825 | 0.969 | 19.033 | 13.458 | 89.046 | 0.969 | 0.106 | 10.762 | 24.780 |
| 7 James Madison | 0.805 | 0.966 | 18.864 | 13.339 | 79.716 | 0.965 | 0.112 | 10.178 | 23.436 |
| 8 James Monroe | 0.561 | 0.921 | 22.290 | 15.761 | 40.726 | 0.930 | 0.157 | 7.587 | 17.470 |
| 9 James Monroe | 0.541 | 0.919 | 24.286 | 17.173 | 40.893 | 0.930 | 0.156 | 7.713 | 17.761 |
| 10 John Quincy Adams | 0.634 | 0.937 | 23.479 | 16.602 | 49.684 | 0.943 | 0.142 | 8.405 | 19.353 |
| 11 Andrew Jackson | 0.797 | 0.964 | 18.582 | 13.139 | 76.076 | 0.964 | 0.115 | 9.922 | 22.847 |
| 12 Andrew Jackson | 0.708 | 0.945 | 16.682 | 11.796 | 50.241 | 0.944 | 0.141 | 7.959 | 18.326 |
| 13 Martin Van Buren | 0.619 | 0.937 | 27.471 | 19.425 | 52.113 | 0.945 | 0.139 | 8.820 | 20.308 |
| 14 William Henry Harrison | 0.445 | 0.902 | 27.518 | 19.458 | 36.509 | 0.919 | 0.166 | 7.479 | 17.221 |
| 15 James Knox Polk | 0.508 | 0.912 | 24.283 | 17.171 | 38.380 | 0.924 | 0.161 | 7.491 | 17.249 |
| 16 Zachary Taylor | 0.783 | 0.961 | 18.000 | 12.728 | 69.674 | 0.960 | 0.120 | 9.453 | 21.765 |
| 17 Franklin Pierce | 0.632 | 0.938 | 25.354 | 17.928 | 51.607 | 0.945 | 0.139 | 8.667 | 19.956 |
| 18 James Buchanan | 0.583 | 0.925 | 21.617 | 15.286 | 42.066 | 0.932 | 0.154 | 7.662 | 17.642 |
| 19 Abraham Lincoln | 0.540 | 0.917 | 22.061 | 15.600 | 38.817 | 0.926 | 0.161 | 7.407 | 17.055 |
| 20 Abraham Lincoln | 0.776 | 0.956 | 14.284 | 10.100 | 58.070 | 0.952 | 0.131 | 8.288 | 19.085 |
| 21 Ulysses S. Grant | 0.703 | 0.944 | 16.595 | 11.734 | 49.195 | 0.943 | 0.143 | 7.871 | 18.124 |
| 22 Ulysses S. Grant | 0.705 | 0.946 | 17.588 | 12.437 | 51.368 | 0.946 | 0.140 | 8.122 | 18.702 |
| 23 Rutherford B. Hayes | 0.595 | 0.927 | 20.623 | 14.583 | 42.047 | 0.932 | 0.154 | 7.598 | 17.495 |
| 24 James A. Garfield | 0.602 | 0.930 | 23.023 | 16.280 | 45.488 | 0.937 | 0.148 | 8.035 | 18.501 |
| 25 Grover Cleveland | 0.702 | 0.947 | 19.993 | 14.137 | 55.155 | 0.949 | 0.135 | 8.598 | 19.797 |
| 26 Benjamin Harrison | 0.562 | 0.925 | 25.930 | 18.335 | 44.231 | 0.935 | 0.150 | 8.088 | 18.623 |
| 27 Grover Cleveland | 0.716 | 0.952 | 22.662 | 16.025 | 62.123 | 0.955 | 0.127 | 9.301 | 21.416 |
| 28 William McKinley | 0.555 | 0.922 | 24.628 | 17.415 | 42.480 | 0.932 | 0.153 | 7.870 | 18.120 |
| 29 William McKinley | 0.659 | 0.940 | 21.758 | 15.385 | 50.986 | 0.945 | 0.140 | 8.401 | 19.344 |
| 30 Theodore Roosevelt | 0.687 | 0.939 | 14.597 | 10.322 | 43.268 | 0.935 | 0.152 | 7.224 | 16.635 |
| 31 William Howard Taft | 0.486 | 0.908 | 24.844 | 17.567 | 37.290 | 0.922 | 0.164 | 7.422 | 17.090 |
| 32 Woodrow Wilson | 0.672 | 0.941 | 19.161 | 13.549 | 49.114 | 0.943 | 0.143 | 8.073 | 18.588 |
| 33 Woodrow Wilson | 0.648 | 0.934 | 16.872 | 11.930 | 42.590 | 0.934 | 0.153 | 7.371 | 16.973 |
| 34 Warren G. Harding | 0.591 | 0.930 | 24.605 | 17.398 | 45.951 | 0.938 | 0.148 | 8.163 | 18.796 |
| 35 Calvin Coolidge | 0.558 | 0.923 | 24.318 | 17.195 | 42.416 | 0.932 | 0.154 | 7.847 | 18.069 |
| 36 Herbert Hoover | 0.520 | 0.913 | 22.130 | 15.649 | 37.408 | 0.923 | 0.163 | 7.286 | 16.777 |
| 37 Franklin D. Roosevelt | 0.668 | 0.941 | 20.429 | 14.445 | 50.335 | 0.944 | 0.141 | 8.260 | 19.019 |
| 38 Franklin D. Roosevelt | 0.638 | 0.934 | 19.468 | 13.766 | 45.166 | 0.938 | 0.149 | 7.779 | 17.912 |
| 39 Franklin D. Roosevelt | 0.678 | 0.940 | 16.894 | 11.946 | 46.214 | 0.939 | 0.147 | 7.665 | 17.649 |
| 40 Franklin D. Roosevelt | 0.768 | 0.953 | 12.456 | 8.808 | 51.099 | 0.945 | 0.140 | 7.581 | 17.456 |
| 41 Harry S. Truman | 0.563 | 0.919 | 19.258 | 13.617 | 37.714 | 0.924 | 0.163 | 7.135 | 16.428 |
| 42 Dwight D. Eisenhower | 0.626 | 0.934 | 21.905 | 15.489 | 46.829 | 0.939 | 0.146 | 8.077 | 18.599 |
| 43 Dwight D. Eisenhower | 0.623 | 0.930 | 17.882 | 12.645 | 41.404 | 0.932 | 0.155 | 7.352 | 16.930 |
| 44 John F. Kennedy | 0.646 | 0.933 | 17.123 | 12.108 | 42.681 | 0.934 | 0.153 | 7.399 | 17.037 |
| 45 Lyndon Baines Johnson | 0.624 | 0.928 | 16.586 | 11.728 | 39.610 | 0.929 | 0.159 | 7.101 | 16.350 |
| 46 Richard Milhous Nixon | 0.595 | 0.925 | 19.241 | 13.606 | 40.453 | 0.930 | 0.157 | 7.370 | 16.970 |
| 47 Richard Milhous Nixon | 0.485 | 0.893 | 14.307 | 10.117 | 27.500 | 0.895 | 0.191 | 5.836 | 13.438 |
| 48 Jimmy Carter | 0.689 | 0.942 | 16.848 | 11.913 | 47.651 | 0.941 | 0.145 | 7.774 | 17.900 |
| 49 Ronald Reagan | 0.627 | 0.934 | 21.553 | 15.240 | 46.553 | 0.939 | 0.147 | 8.032 | 18.495 |
| 50 Ronald Reagan | 0.580 | 0.924 | 20.978 | 14.834 | 41.045 | 0.931 | 0.156 | 7.535 | 17.350 |
| 51 George Bush | 0.577 | 0.922 | 19.256 | 13.616 | 38.843 | 0.927 | 0.160 | 7.233 | 16.654 |
| 52 Bill Clinton | 0.643 | 0.934 | 18.160 | 12.841 | 43.889 | 0.936 | 0.151 | 7.579 | 17.451 |
| 53 Bill Clinton | 0.538 | 0.912 | 18.418 | 13.024 | 35.016 | 0.918 | 0.169 | 6.838 | 15.746 |
| 54 George W. Bush | 0.616 | 0.928 | 17.584 | 12.434 | 40.269 | 0.930 | 0.158 | 7.235 | 16.658 |
| 55 George W. Bush | 0.597 | 0.926 | 19.569 | 13.837 | 41.050 | 0.931 | 0.156 | 7.443 | 17.139 |
| 56 Barack Obama | 0.662 | 0.942 | 22.969 | 16.242 | 52.968 | 0.947 | 0.137 | 8.634 | 19.880 |
| 57 Barack Obama | 0.639 | 0.936 | 20.858 | 14.749 | 47.106 | 0.940 | 0.146 | 8.032 | 18.494 |
| 58 Donald J. Trump | 0.595 | 0.922 | 16.560 | 11.709 | 37.004 | 0.923 | 0.164 | 6.876 | 15.833 |
##calculate create new column with chosen lexical diversity
inaug_speeches_clean$LexDiv_r <- textstat_lexdiv(dfm_toksNgrams, measure = c("CTTR"), log.base = 10)require(ggplot2)
require(plotly)##convert strings we will visualize to numeric
inaug_speeches_clean <- inaug_speeches_clean[, Year:=as.numeric(Year)]
## look at number of Inaugurations by Address Number
dt <- inaug_speeches_clean[, .(number_of_distinct = uniqueN(PresidentNumber)), by = Inaugural.Address]
kable(dt, escape = F, "html", table.attr = "class='dtable'") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")| Inaugural.Address | number_of_distinct |
|---|---|
| First Inaugural Address | 39 |
| Second Inaugural Address | 17 |
| Third Inaugural Address | 1 |
| Fourth Inaugural Address | 1 |
###Time Series Plot
##subset
first_inaug <-
inaug_speeches_clean %>%
filter(Inaugural.Address == "First Inaugural Address")
##create plot
LexDiv_plot <- ggplot(data=first_inaug, aes(x=Year, y=LexDiv_r)) +
geom_line(colour = "blue", linejoin = "mitre")+
geom_point(colour = "Black") +
theme(axis.text.x = element_text(vjust=1,angle=90)) +
geom_text(aes(label=Name), vjust=0,angle=90,size=2.5,hjust=0) +
scale_x_continuous(breaks = seq(1789,2017,4))
##show plot
LexDiv_plot##convert and show in plotly
LexDiv_plotly <- ggplotly(LexDiv_plot)
LexDiv_plotly##################### scatter plot
##scatter plot of number of words vs Lexical Diversity Measure
scat <- ggplot(inaug_speeches_clean, aes(x = num_words, y = LexDiv_r)) +
geom_point(aes(color = Year))
##convert to plotly and visualize
scat <- ggplotly(scat)
scatWe find that there have been 17 presidents that have been elected for a second term. We also see thaere does not appear to be a trend in the variety of language used over the years (based on CTTR measure of Lexical Doversity). Upon further examination, we see that the Lexical Diversity measure we used may be biased towards longer responses.
require(rJava)
require(NLP)
require(openNLP)
require(data.table)
require(dplyr)##load in data
#inaug_speeches_clean <- read.csv("C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks/inaug_speeches_clean.csv")
###convert raw data to data.table
data <- as.data.table(inaug_speeches_clean)
###convert column(s) with text to be anaalyzed to charachter
data <- data[, text_final:=as.character(text_final)]
###to subset data with less than certain number of charachters (optional)
#data$NumChar <- nchar(data$TextColumn)
#data <- data[NumChar > 3, ]
###check for unique responses to subset by (optional)
#unique(data$ColumnWithFactors)
#data_subset <- data[which(data$ColumnWithFactors == 'Particular Response'),]
##remove NAs
data <- data[!(is.na(data$text_final) | data$text_final==""), ]
#select only text column for analysis
names(inaug_speeches_clean)## [1] "PresidentNumber" "Name" "Inaugural.Address"
## [4] "DayOfWeek" "Year" "Date"
## [7] "DateOriginal" "text_final" "nchar"
## [10] "num_words" "Speech" "UniqueResp"
## [13] "LexDiv_r"
##subset to first inauguration speeches only
##feel free to adjust the exact subset you use
inaug_speeches_first <- inaug_speeches_clean[which(inaug_speeches_clean$Inaugural.Address == 'First Inaugural Address'),]
text <- inaug_speeches_first$text_final
###for tagging
text <- as.String(text)require(magrittr)
require(openNLPmodels.en)
require(rJava)
require(NLP)
require(openNLP)
require(data.table)
library(dplyr)##increase memory limit ~ 7.2gb
memory.limit(size=56500)
##extract words
word_ann <- Maxent_Word_Token_Annotator()
##extract dentences
sent_ann <- Maxent_Sent_Token_Annotator()
pos_ann <- Maxent_POS_Tag_Annotator()
pos_annotations <- annotate(text, list(sent_ann, word_ann, pos_ann))
text_annotations <- annotate(text, list(sent_ann, word_ann))
#head(text_annotations)
text_doc <- AnnotatedPlainTextDocument(text, text_annotations)
#words(text_doc) %>% head(10)
person_ann <- Maxent_Entity_Annotator(kind = "person")
location_ann <- Maxent_Entity_Annotator(kind = "location")
organization_ann <- Maxent_Entity_Annotator(kind = "organization")
date_ann <- Maxent_Entity_Annotator(kind = "date")
pipeline <- list(sent_ann,
word_ann,
person_ann,
location_ann,
organization_ann,
date_ann)
text_annotations <- annotate(text_doc, pipeline)
text_doc <- AnnotatedPlainTextDocument(text, text_annotations)
entities <- function(doc, kind){
s <- doc$content
a <- annotations(doc)[[1]]
if(hasArg(kind)) {
k <- sapply(a$features, '[[', "kind")
s[a[k == kind]]
} else {
a[s[a$tpe == "entity"]]
}
}
person <- as.data.table(entities(text_doc, kind = "person"))
location <- as.data.table(entities(text_doc, kind = "location"))
organization <- as.data.table(entities(text_doc, kind = "organization"))
date <- as.data.table(entities(text_doc, kind = "date"))
##visualize with googleVis
#library(googleVis)
#dfl <- data.frame(table(entities(text_doc, kind = "location")))
#Barp <- gvisColumnChart(dfl)
#plot(Barp)
##Convert NER terms to new data frame with frequency
##################################### location ############################################
location$X <- seq.int(nrow(location))
location$NER_type <- "location"
location_tibble <- location %>%
group_by(V1) %>%
mutate(Count=n_distinct(X)) %>%
arrange(desc(Count))
##remove duplicates (specify columns that create make a response unique)
location_final <- location_tibble[!duplicated(location_tibble$V1), c(1,3:4)]
#################################### person ############################################
person$X <- seq.int(nrow(person))
person$NER_type <- "person"
person_tibble <- person %>%
group_by(V1) %>%
mutate(Count=n_distinct(X)) %>%
arrange(desc(Count))
##remove duplicates (specify columns that create make a response unique)
person_final <- person_tibble[!duplicated(person_tibble$V1), c(1, 3:4)]
################################## organization #########################################
organization$X <- seq.int(nrow(organization))
organization$NER_type <- "organization"
organization_tibble <- organization %>%
group_by(V1) %>%
mutate(Count=n_distinct(X)) %>%
arrange(desc(Count))
##remove duplicates (specify columns that create make a response unique)
organization_final <- organization_tibble[!duplicated(organization_tibble$V1), c(1, 3:4)]
################################# date ##################################################
date$X <- seq.int(nrow(date))
date$NER_type <- "date"
date_tibble <- date %>%
group_by(V1) %>%
mutate(Count=n_distinct(X)) %>%
arrange(desc(Count))
##remove duplicates (specify columns that create make a response unique)
date_final <- date_tibble[!duplicated(date_tibble$V1), c(1, 3:4)]
##rbind columns
NER_combined <- rbind(organization_final, date_final, person_final, location_final)require(ggplot2)
require(wordcloud)##subset top 40 most frequent terms using dplyr pipes
NER_combined2 <- NER_combined %>%
arrange(desc(Count)) %>%
head(40)
ggplot(NER_combined2, aes(x=reorder(V1, -Count),y=(Count))) +
geom_bar(stat = "identity",width=0.5,
aes(fill = NER_combined2$NER_type)) +
theme(axis.text.x = element_text(vjust=1,angle=90)) + theme(legend.position="bottom") +
scale_fill_manual(values = c("red", "goldenrod", "blue", "light blue", "black")) +
geom_text(aes(label=Count), vjust=0,angle=90,size=2.5,hjust=0)+
labs(title="NER: Frequencies", caption="United States: Inauguration Speeches")The creation of this dictionary will allow us to compare to individual responses, as the NER model takes in free text as one chunk to compare to it’s machine-trained model.
############################## NER: Dictionary Creation ############################################
###export NER terms for further analysis
setwd("C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks")
write.csv(NER_combined, 'NER_combined.CSV')
###################################### NER Term List #############################################
##returns lists of custom bad words dictionary in r
NER_combined <- read.csv("C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks/NER_combined.CSV")
NER_combined <- as.data.table(NER_combined)
##convert needed text columns to charachters
#str(NER_combined)
##rename column with words
colnames(NER_combined)[colnames(NER_combined) == 'V1'] <- 'NER_terms'
##convert NER terms from factors to charachters
NER_combined <- NER_combined[, NER_terms:=as.character(NER_terms)]
##trim whitespace and convert to lowercase
NER_combined$NER_terms <- trimws(tolower(NER_combined$NER_terms))
##replace spaces with underscores
NER_combined$NER_terms <- gsub(" ", "_", NER_combined$NER_terms)
NER_terms <- as.data.table(NER_combined$NER_terms)
### transform list of bad words to custome dictionary format ###
NER_terms <- as.data.table(NER_combined$NER_terms)
NER_dictionary <- dictionary(as.list(NER_terms))############ apply custome dictionary and output words occuring in new column ######################
## Tokenize, convert to DFM, select N-Grams, and clean
###tokenization packages
require(tibble)
require(quanteda)
require(tm)
require(stringr)
library(data.table)
##increase memory limit (~7.2GB)
memory.limit(size=56500)
################################## Text Preprocessing Pipeline #####################################
###create custome stopwords list
custom_stopwords <- (c(stopwords("english"), "additional", "stopwords"))
##tokenization and dfm (data frame matrix) transformation function
tokenize_dfm <- function(x) {tokenize <- tokens(x, remove_punct = TRUE,
remove_symbols = TRUE)
tok_lower <- tokens_tolower(tokenize)
#tok_stem <- tokens_wordstem(tok_lower, language = quanteda_options("language_stemmer"))
toksNgrams <- tokens_ngrams(tok_lower, n = c(1, 2, 3), concatenator = "_") ###specify n-grams
dfm_toksNgrams <- dfm(toksNgrams)
#dfm_toksNgrams <- dfm_remove(dfm_toksNgrams, custom_stopwords)
## keeps only words occuring in "bad words" dictionary
dfm_toksNgrams <- dfm_select(dfm_toksNgrams, NER_dictionary, selection = "keep")
dfm_toksNgrams <- dfm_remove(dfm_toksNgrams,
pattern = c(paste0("^", stopwords("english"), "_"),
paste0("_", stopwords("english"), "$")),
valuetype = "regex")
rowTotals <- apply(dfm_toksNgrams, 1, sum) ###remove rows with no words
tokens_dfm_new <- dfm_toksNgrams[rowTotals> 0, ]}
###run select data through tokenization/cleaning function
FreeText_tokens_dfm <- tokenize_dfm(inaug_speeches_clean$text_final)
###returns matrix dimensions and sparsity
FreeText_tokens_dfm
###view top terms
head(featnames(FreeText_tokens_dfm))
##returns overall counts of word occurnences
textstat_frequency(FreeText_tokens_dfm)
####transform dfm to data frame
FreeText_tokens_df <- convert(FreeText_tokens_dfm, "data.frame")
FreeText_tokens_df[FreeText_tokens_df == 0] <- FALSE
FreeText_tokens_df[FreeText_tokens_df != 0] <- TRUE
#wc <- droplevels(col(FreeText_tokens_df, as.factor=TRUE)[which(FreeText_tokens_df != NA)])
#FreeText_tokens_df[levels(wc)] <- Map(factor, FreeText_tokens_df[levels(wc)], labels = levels(wc))
#FreeText_tokens_df
##change True/False to column names (words/phrases)
w <- which(FreeText_tokens_df==TRUE,arr.ind=TRUE)
FreeText_tokens_df[w] <- names(FreeText_tokens_df)[w[,"col"]]
##create new column that concatenates all columns (with occurences of words) by individual response
FreeText_tokens_df$NER_terms <- apply(FreeText_tokens_df[, 1:ncol(FreeText_tokens_df)], 1, paste, collapse = ",")
##remove whitespace
#trimws(FreeText_tokens_df$NER_terms)
##Remove "0s" (that represent blanks) in concatenated column
FreeText_tokens_df$NER_terms <- gsub("0,", "", FreeText_tokens_df$NER_terms)
FreeText_tokens_df$NER_terms <- gsub("0", "", FreeText_tokens_df$NER_terms)
##Remove last character "," at end of string
FreeText_tokens_df$NER_terms <- substr(FreeText_tokens_df$NER_terms,1,nchar(FreeText_tokens_df$NER_terms)-1)
dim(FreeText_tokens_df)
##rearrange columns to have common delimited list first
FreeText_tokens_df <- FreeText_tokens_df[c(ncol(FreeText_tokens_df), 1:(ncol(FreeText_tokens_df)-1))]
## check to make necessary columns remain
names(FreeText_tokens_df)
## create new df with list of NER terms
text_NERterms <- FreeText_tokens_df[c(1)]
### count number of bad words occuring in new column
text_NERterms$count_NERterms <- str_count(text_NERterms$NER_terms, ",") + 1
##join to feebackCombined
setDT(text_NERterms, keep.rownames = TRUE)[]
text_NERterms$rn <- gsub("[A-Za-z]", "", text_NERterms$rn)
text_NERterms <- text_NERterms[, rn:=as.integer(rn)]
################################ merge bad words to complete set ##################################
merge_doc <- function(x, y) {merge(x, y,
by.x="PresidentNumber", by.y="rn", all.x = TRUE)}
feedbackCombined_NERterms <- merge_doc(inaug_speeches_clean, text_NERterms)
#str(feedbackCombined_NERterms)
##duplicate list to use for tooltip
feedbackCombined_NERterms$NER_terms_list <- feedbackCombined_NERterms$NER_terms
### count number of bad words occuring in new column
feedbackCombined_NERterms$count_NERterms <- str_count(feedbackCombined_NERterms$NER_terms, ",") + 1
##create placeholder for responses without at least 1 bad word
feedbackCombined_NERterms$NER_terms[which(is.na(feedbackCombined_NERterms$NER_terms))] <- "none"
feedbackCombined_NERterms$NER_terms[which(feedbackCombined_NERterms$NER_terms == c(""))] <- "none"This transformation will allow us to filter by indvidual words.
require(reshape)
require(tidyr)##seperate NER terms into multiple new columns
feedbackCombined_NERterms <- separate(feedbackCombined_NERterms, "NER_terms", c(paste0("NER_terms",1:60)),
sep = ",")
##convert new split columns to data.table
feedbackCombined_NERterms <- as.data.table(feedbackCombined_NERterms)
#names(feedbackCombined_NERterms)
##melt split out columns
feedbackCombined_NERterms <- melt(feedbackCombined_NERterms, measure.vars = c(paste0("NER_terms",1:60)),
value.name = "NER_term")
##remove NAs
feedbackCombined_NERterms <- feedbackCombined_NERterms[!is.na(feedbackCombined_NERterms$NER_term)]##check column names
names(feedbackCombined_NERterms)## [1] "PresidentNumber" "Name" "Inaugural.Address"
## [4] "DayOfWeek" "Year" "Date"
## [7] "DateOriginal" "text_final" "nchar"
## [10] "num_words" "Speech" "UniqueResp"
## [13] "LexDiv_r" "count_NERterms" "NER_terms_list"
## [16] "variable" "NER_term"
##check for unique counts
feedbackCombined_NERterms[, .(number_of_distinct = uniqueN(PresidentNumber)), by = NER_term]
##subset columns########################### Final data export #####################################################
setwd('C:/Users/JO054429/Documents/Consulting_ClientSurveysPractice/Analytics Training Center/Text Analysis Template Scripts/TA Lightning Talks')
write.csv(feedbackCombined_NERterms, "feedbackCombined_NERterms.csv")